This is a project on conducting Cohort and Customer Churn Analysis on online retail sales data. Online Sales data was collected between 01/12/2010 and 09/12/2011 for a UK-based online retail store. We will examine the retention rate of customers over a 13 month period. First we remove the N/A’s from our data set and then use the “cohorts” package in R to create a cohort table. Then we will use a line graph and tile chart to visualize the 13 cohorts over the months.

Original dataset: https://www.kaggle.com/datasets/ersany/online-retail-dataset

Install Packages

install.packages('tidyverse', repos = "http://cran.us.r-project.org")

## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)

## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Steve\AppData\Local\Temp\RtmpeCCFDj\downloaded_packages

install.packages('readxl', repos = "http://cran.us.r-project.org")

## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)

## package 'readxl' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'readxl'

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\Steve\AppData\Local\R\win-library\4.2\00LOCK\readxl\libs\x64\readxl.dll
## to C:\Users\Steve\AppData\Local\R\win-library\4.2\readxl\libs\x64\readxl.dll:
## Permission denied

## Warning: restored 'readxl'

## 
## The downloaded binary packages are in
##  C:\Users\Steve\AppData\Local\Temp\RtmpeCCFDj\downloaded_packages

install.packages('cohorts', repos = "http://cran.us.r-project.org")

## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)

## package 'cohorts' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Steve\AppData\Local\Temp\RtmpeCCFDj\downloaded_packages

library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(readxl)
library(cohorts)

Read Data frame

df <- read_xlsx('Online Retail.xlsx')

Inspect the data

str(df)

## tibble [541,909 × 8] (S3: tbl_df/tbl/data.frame)
##  $ InvoiceNo  : chr [1:541909] "536365" "536365" "536365" "536365" ...
##  $ StockCode  : chr [1:541909] "85123A" "71053" "84406B" "84029G" ...
##  $ Description: chr [1:541909] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity   : num [1:541909] 6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: POSIXct[1:541909], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
##  $ UnitPrice  : num [1:541909] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : num [1:541909] 17850 17850 17850 17850 17850 ...
##  $ Country    : chr [1:541909] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...

summary(df)

##   InvoiceNo          StockCode         Description           Quantity        
##  Length:541909      Length:541909      Length:541909      Min.   :-80995.00  
##  Class :character   Class :character   Class :character   1st Qu.:     1.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :     3.00  
##                                                           Mean   :     9.55  
##                                                           3rd Qu.:    10.00  
##                                                           Max.   : 80995.00  
##                                                                              
##   InvoiceDate                       UnitPrice           CustomerID    
##  Min.   :2010-12-01 08:26:00.00   Min.   :-11062.06   Min.   :12346   
##  1st Qu.:2011-03-28 11:34:00.00   1st Qu.:     1.25   1st Qu.:13953   
##  Median :2011-07-19 17:17:00.00   Median :     2.08   Median :15152   
##  Mean   :2011-07-04 13:34:57.16   Mean   :     4.61   Mean   :15288   
##  3rd Qu.:2011-10-19 11:27:00.00   3rd Qu.:     4.13   3rd Qu.:16791   
##  Max.   :2011-12-09 12:50:00.00   Max.   : 38970.00   Max.   :18287   
##                                                       NA's   :135080  
##    Country         
##  Length:541909     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

Remove rows containing NA’s/Nulls in the CustomerID column and convert date format

df1 <- na.omit(df) 
df1$YMD <- as.Date(df1$InvoiceDate)

Create new dataframe with CustomerID and Date

cohort_data <- df1 %>% select(CustomerID, YMD)

Create a cohort table by month using cohort_table_month function

cohort_data %>% cohort_table_month(CustomerID, YMD)

## # A tibble: 13 × 14
##    cohort `Dec 2010` `Jan 2011` Feb 20…¹ Mar 2…² Apr 2…³ May 2…⁴ Jun 2…⁵ Jul 2…⁶
##     <int>      <int>      <int>    <int>   <int>   <int>   <int>   <int>   <int>
##  1      1        948        362      317     367     341     376     360     336
##  2      2         NA        421      101     119     102     138     126     110
##  3      3         NA         NA      380      94      73     106     102      94
##  4      4         NA         NA       NA     440      84     112      96     102
##  5      5         NA         NA       NA      NA     299      68      66      63
##  6      6         NA         NA       NA      NA      NA     279      66      48
##  7      7         NA         NA       NA      NA      NA      NA     235      49
##  8      8         NA         NA       NA      NA      NA      NA      NA     191
##  9      9         NA         NA       NA      NA      NA      NA      NA      NA
## 10     10         NA         NA       NA      NA      NA      NA      NA      NA
## 11     11         NA         NA       NA      NA      NA      NA      NA      NA
## 12     12         NA         NA       NA      NA      NA      NA      NA      NA
## 13     13         NA         NA       NA      NA      NA      NA      NA      NA
## # … with 5 more variables: `Aug 2011` <int>, `Sep 2011` <int>,
## #   `Oct 2011` <int>, `Nov 2011` <int>, `Dec 2011` <int>, and abbreviated
## #   variable names ¹`Feb 2011`, ²`Mar 2011`, ³`Apr 2011`, ⁴`May 2011`,
## #   ⁵`Jun 2011`, ⁶`Jul 2011`

Create a percentage cohort table by month

cohort_data %>% cohort_table_month(CustomerID, YMD) %>% cohort_table_pct()

## # A tibble: 13 × 14
##    cohort `Dec 2010` `Jan 2011` Feb 20…¹ Mar 2…² Apr 2…³ May 2…⁴ Jun 2…⁵ Jul 2…⁶
##     <int>      <dbl>      <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1      1        100       38.2     33.4    38.7    36      39.7    38      35.4
##  2      2         NA      100       24      28.3    24.2    32.8    29.9    26.1
##  3      3         NA       NA      100      24.7    19.2    27.9    26.8    24.7
##  4      4         NA       NA       NA     100      19.1    25.5    21.8    23.2
##  5      5         NA       NA       NA      NA     100      22.7    22.1    21.1
##  6      6         NA       NA       NA      NA      NA     100      23.7    17.2
##  7      7         NA       NA       NA      NA      NA      NA     100      20.9
##  8      8         NA       NA       NA      NA      NA      NA      NA     100  
##  9      9         NA       NA       NA      NA      NA      NA      NA      NA  
## 10     10         NA       NA       NA      NA      NA      NA      NA      NA  
## 11     11         NA       NA       NA      NA      NA      NA      NA      NA  
## 12     12         NA       NA       NA      NA      NA      NA      NA      NA  
## 13     13         NA       NA       NA      NA      NA      NA      NA      NA  
## # … with 5 more variables: `Aug 2011` <dbl>, `Sep 2011` <dbl>,
## #   `Oct 2011` <dbl>, `Nov 2011` <dbl>, `Dec 2011` <dbl>, and abbreviated
## #   variable names ¹`Feb 2011`, ²`Mar 2011`, ³`Apr 2011`, ⁴`May 2011`,
## #   ⁵`Jun 2011`, ⁶`Jul 2011`

Shift cohort table left to start at time, t=0

cohort_data %>% cohort_table_month(CustomerID, YMD) %>% cohort_table_pct() %>% shift_left()

## # A tibble: 13 × 14
##    cohort    t0    t1    t2    t3    t4    t5    t6    t7    t8    t9   t10
##     <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1      1   100  38.2  33.4  38.7  36    39.7  38    35.4  35.4  39.5  37.3
##  2      2   100  24    28.3  24.2  32.8  29.9  26.1  25.7  31.1  34.7  36.8
##  3      3   100  24.7  19.2  27.9  26.8  24.7  25.5  28.2  25.8  31.3   9.2
##  4      4   100  19.1  25.5  21.8  23.2  17.7  26.4  23.9  28.9   8.9   0  
##  5      5   100  22.7  22.1  21.1  20.7  23.7  23.1  26.1   8.4   0     0  
##  6      6   100  23.7  17.2  17.2  21.5  24.4  26.5  10.4   0     0     0  
##  7      7   100  20.9  18.7  27.2  24.7  33.6  10.2   0     0     0     0  
##  8      8   100  20.9  20.4  23    27.2  11.5   0     0     0     0     0  
##  9      9   100  25.1  25.1  25.1  13.8   0     0     0     0     0     0  
## 10     10   100  29.9  32.6  12.1   0     0     0     0     0     0     0  
## 11     11   100  26.4  13.1   0     0     0     0     0     0     0     0  
## 12     12   100  13.4   0     0     0     0     0     0     0     0     0  
## 13     13   100   0     0     0     0     0     0     0     0     0     0  
## # … with 2 more variables: t11 <dbl>, t12 <dbl>

Can also do the same thing with shift_left_pct() function

cohort_data %>% cohort_table_month(CustomerID, YMD) %>% shift_left_pct()

## # A tibble: 13 × 14
##    cohort    t0    t1    t2    t3    t4    t5    t6    t7    t8    t9   t10
##     <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1      1   100  38.2  33.4  38.7  36    39.7  38    35.4  35.4  39.5  37.3
##  2      2   100  24    28.3  24.2  32.8  29.9  26.1  25.7  31.1  34.7  36.8
##  3      3   100  24.7  19.2  27.9  26.8  24.7  25.5  28.2  25.8  31.3   9.2
##  4      4   100  19.1  25.5  21.8  23.2  17.7  26.4  23.9  28.9   8.9   0  
##  5      5   100  22.7  22.1  21.1  20.7  23.7  23.1  26.1   8.4   0     0  
##  6      6   100  23.7  17.2  17.2  21.5  24.4  26.5  10.4   0     0     0  
##  7      7   100  20.9  18.7  27.2  24.7  33.6  10.2   0     0     0     0  
##  8      8   100  20.9  20.4  23    27.2  11.5   0     0     0     0     0  
##  9      9   100  25.1  25.1  25.1  13.8   0     0     0     0     0     0  
## 10     10   100  29.9  32.6  12.1   0     0     0     0     0     0     0  
## 11     11   100  26.4  13.1   0     0     0     0     0     0     0     0  
## 12     12   100  13.4   0     0     0     0     0     0     0     0     0  
## 13     13   100   0     0     0     0     0     0     0     0     0     0  
## # … with 2 more variables: t11 <dbl>, t12 <dbl>

Next we use the above cohort table and pivot the time values to make a long data frame

Then we create a new column with the time as a numeric variable

cohort_data_long <- cohort_data  %>% cohort_table_month(CustomerID, YMD) %>% 
  shift_left_pct() %>% pivot_longer(-cohort) %>%  mutate(time=as.numeric(str_remove(name,"t")))

Create a line plot of the retention rate of each cohort across all the months

cohort_data_long %>%  
  filter(value > 0,cohort <=13, time>=0) %>% 
  ggplot(aes(time,value,colour=factor(cohort), group=cohort)) + 
  geom_line(size = 1.5) +
geom_point(size = 1.5) +
  theme_light() + labs(y="Retention Rate (%)", x="Month") + 
  ggtitle("Line Graph of Retention Rates") + theme(plot.title = element_text(hjust = 0.5))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

Cohort table plotted with tiles

cohort_data_long %>%
  filter(time >= 0, value > 0) %>%
  ggplot(aes(time, reorder(cohort, desc(cohort)))) +
  geom_raster(aes(fill = log(value))) +
  coord_equal(ratio = 1) +
  geom_text(aes(label = glue::glue("{round(value,0)}%")), 
            size  = 3, 
            colour = "snow") +
  scale_fill_gradient(guide = "none") +
  theme_minimal(base_size = 16) +
  theme(panel.grid   = element_blank(),
        panel.border = element_blank()) +
  labs(y = "Cohort",x="Month") + 
  ggtitle("Cohort Tile Chart for Retention Rate") + theme(plot.title = element_text(hjust = 0.5))

Cohort Analysis and Customer Churn On Online Retail Sales Data

Stephen Situ

2022-12-06